Multivariate Prophet Model

Predicting COVID-19 Deaths (STAT 390)

Author

Ryan Nguyen

1. Import Libraries & Set Seed

Code
library(forecast)
library(tseries)
library(tidyverse)
library(tidymodels)
library(kableExtra)
library(modeltime)
library(prophet)
library(timetk)
library(Metrics)
library(zoo)
library(MLmetrics)

set.seed(123)

2. Pull Data

Code
selected_death <- read_csv("data/finaldata_selectedfeatures.csv")

3. Data Processing

Code
selected_deaths <- selected_death  %>% 
  rename(ds = date,
         y = covid_19_deaths) %>% 
  select(-c(starts_with("deaths_"), "dayofweek", "quarter", "month", "dayofyear", "dayofmonth",
            "weekofyear"))

# LOCF 
selected_deaths <- fill(selected_deaths, everything())

selected_deaths <- replace(selected_deaths, is.na(selected_deaths), 100000000)

naniar::miss_var_summary(selected_deaths)
# A tibble: 67 × 3
   variable                                n_miss pct_miss
   <chr>                                    <int>    <dbl>
 1 ds                                           0        0
 2 y                                            0        0
 3 administered_5plus                           0        0
 4 distributed_novavax                          0        0
 5 series_complete_moderna_65plus               0        0
 6 administered_dose1_recip_18plus              0        0
 7 additional_doses_12plus                      0        0
 8 administered_dose1_recip_65plus              0        0
 9 administered_dose1_recip_12plus              0        0
10 administered_dose1_recip_18plus_pop_pct      0        0
# ℹ 57 more rows
Code
selected_deaths <- selected_deaths %>% 
  mutate(y = case_when((y == 0) ~ 1,
                       .default = y))

# selected_deaths <- selected_deaths %>% 
#   mutate(y = log(y))

4. Train Test Split

Code
#splits <- time_series_split(selected_deaths, initial = "148 weeks", assess = "17 weeks")
#train_data <- training(splits)
#test_data <- testing(splits)

train_data <- selected_deaths %>% 
  filter(year != 2023) %>% 
  select(-c(year))

test_data <- selected_deaths %>% 
  filter(year == 2023) %>% 
  select(-c(year))

5. Baseline Model

5.1 Build Baseline Model

Code
# Use default parameters to initiate Prophet Model
# Fit model on training set
baseline_model <- prophet(train_data)

5.2 Baseline Model Forecast

Code
# Create time range for forecast
future_baseline <- make_future_dataframe(baseline_model, freq = "week", periods = 8)

# Make prediction
forecast_baseline <- predict(baseline_model, future_baseline)

# Visualize the forecast
plot(baseline_model, forecast_baseline) +
  labs(title = "Baseline Prophet Model")

Code
prophet_plot_components(baseline_model, forecast_baseline)

5.3 Baseline Model Performance

Code
# Merge actual and predicted values
forecast_baseline1 <- predict(baseline_model, selected_deaths)

forecast_baseline2 <- forecast_baseline1 %>%
  filter(ds > as.POSIXct("2023-01-01"))  %>% 
  mutate(index = seq(1:332))

test_data <- test_data %>%
  mutate(index = seq(1:332))

performance_baseline <- full_join(test_data, forecast_baseline2, by = join_by(ds == ds, index == index)) %>% 
  select(ds, y, yhat, index)
# 
# performance_baseline <- performance_baseline %>% 
#   mutate(y = exp(y),
#          yhat = exp(yhat))

ggplot(performance_baseline) +
  geom_line(aes(x = ds, y = y, color = "red")) +
  #geom_point(aes(x = index, y = y, color = "red")) +
  geom_line(aes(x = ds, y = yhat, color = "blue")) +
  #geom_point(aes(x = index, y = yhat, color = "blue")) +
  scale_color_manual(name="Value", values = c("red", "blue"), labels = c("Predicted", "Actual")) +
  labs(title = "Baseline Prophet Test Set", 
       y = "Deaths",
       x = "Dates")

Code
# Check MAE value
baseline_mae <- mae(performance_baseline$y, performance_baseline$yhat)
print(paste("The MAE for the baseline model is", baseline_mae))
[1] "The MAE for the baseline model is 109.395757854377"
Code
# Check MASE value
baseline_mase <- mase(performance_baseline$y, performance_baseline$yhat)
print(paste("The MASE for the baseline model is", baseline_mase))
[1] "The MASE for the baseline model is 1.79097813086353"

6. Tuning Changepoints

6.1 Plot Default Changepoints

Code
plot(baseline_model, forecast_baseline) + add_changepoints_to_plot(baseline_model)

Code
# Create prophet model with CI of 95%
model_changepoint <- prophet(train_data, interval.width = 0.95, n.changepoints = 3)

6.2 New Changepoints Model Forecast

Code
# Create time range for forecast
future_changepoint <- make_future_dataframe(model_changepoint, freq = "week", periods = 8)

# Make prediction
forecast_changepoint <- predict(model_changepoint, future_changepoint)

# Visualize the forecast
plot(model_changepoint, forecast_changepoint) +
  labs(title = "Changepoint Prophet Model")

Code
# Visualize forecast components
prophet_plot_components(model_changepoint, forecast_changepoint)

Code
# Change points to plot
plot(model_changepoint, forecast_changepoint) + add_changepoints_to_plot(model_changepoint)

Code
# Merge actual and predicted values
forecast_changepoint2 <- forecast_changepoint %>%
  filter(ds > as.POSIXct("2023-01-01"))

performance_changepoint <- full_join(test_data, forecast_changepoint2, by = join_by(ds == ds)) %>%
  select(ds, y, yhat)

# performance_changepoint <- performance_changepoint %>% 
#   mutate(y = exp(y),
#          yhat = exp(yhat))

ggplot(performance_changepoint) +
  geom_line(aes(x = ds, y = y, color = "red")) +
  #geom_point(aes(x = ds, y = y, color = "red")) +
  geom_line(aes(x = ds, y = yhat, color = "blue")) +
  #geom_point(aes(x = ds, y = yhat, color = "blue")) +
  scale_color_manual(name="Value", values = c("red", "blue"), labels = c("Predicted", "Actual")) +
  labs(title = "Changepoint Prophet Test Set", 
       y = "Deaths",
       x = "Dates") 

Code
# Check MAE value
changepoint_mae <- mae(performance_changepoint$y, performance_changepoint$yhat)
print(paste("The MAE for the changepoint model is", changepoint_mae))
[1] "The MAE for the changepoint model is 80.6968304982461"
Code
# Check MASE value
changepoint_mase <- mase(performance_changepoint$y, performance_changepoint$yhat)
print(paste("The MASE for the changepoint model is", changepoint_mase))
[1] "The MASE for the changepoint model is 1.32113220372537"

7. Add Seasonality to Baseline Model

7.1 Build Model with Seasonality

Code
# Add seasonality and fit to training set
model_season <- prophet(train_data,
                        yearly.seasonality = TRUE,
                        weekly.seasonality = TRUE)

7.2 Seasonality Model Forecast

Code
# Create time range for forecast
future_season <- make_future_dataframe(model_season, freq = "week", periods = 8)

# Make prediction
forecast_season <- predict(model_season, future_season)

# Visualize the forecast
plot(model_season, forecast_season) +
  labs(title = "Seasonal Prophet Model")

Code
prophet_plot_components(model_season, forecast_season)

7.3 Seasonality Model Performance

Code
# Merge actual and predicted values
forecast_season2 <- forecast_season %>%
  filter(ds > as.POSIXct("2023-01-01"))

performance_season <- full_join(test_data, forecast_season2, by = join_by(ds == ds)) %>% 
  select(ds, y, yhat)

# performance_season <- performance_season %>% 
#   mutate(y = exp(y),
#          yhat = exp(yhat))

ggplot(performance_season) +
 geom_line(aes(x = ds, y = y, color = "red")) +
  #geom_point(aes(x = ds, y = y, color = "red")) +
  geom_line(aes(x = ds, y = yhat, color = "blue")) +
  #geom_point(aes(x = ds, y = yhat, color = "blue")) +
  scale_color_manual(name="Value", values = c("red", "blue"), labels = c("Predicted", "Actual")) +
  labs(title = "Seasonality Prophet Test Set", 
       y = "Deaths",
       x = "Dates") 

Code
# Check MAE value
season_mae <- mae(performance_season$y, performance_season$yhat)
print(paste("The MAE for the seasonality model is", season_mae))
[1] "The MAE for the seasonality model is 104.092782042215"
Code
# Check MASE value
season_mase <- mase(performance_season$y, performance_season$yhat)
print(paste("The MASE for the seasonality model is", season_mase))
[1] "The MASE for the seasonality model is 1.70416019665511"

8. Multivariate Model

8.1 Build the Multivariate Model

Code
train_data2 <- train_data %>% 
  select(-c(ds, y))

features <- colnames(train_data2)

# Add seasonality
model_multivariate <- prophet(train_data, fit = FALSE)

# Add all regressors
for (name in features){
   model_multivariate <- add_regressor(model_multivariate, name)
}

# Fit on training set
multi_fit <- fit.prophet(model_multivariate, train_data)

8.2 Multivariate Model Forecast

Code
# Create time range for forecast
future_regressor <- make_future_dataframe(multi_fit, freq = "week", periods = 8)

# Append the regressor values
future_regressor <- full_join(future_regressor, train_data, by = join_by(ds == ds))


# Make prediction
forecast_regressor <- predict(multi_fit, selected_deaths)

# Visualize the forecast
plot(multi_fit, forecast_regressor) +
  labs(title = "Regressor Prophet Model")

Code
prophet_plot_components(multi_fit, forecast_regressor)

8.3 Multivariate Model Performance

Code
# Merge actual and predicted values
forecast_regressor2 <- forecast_regressor %>%
  filter(ds > as.POSIXct("2023-01-01")) %>% 
  select(ds, yhat) %>% 
  mutate(index = seq(1:332))

test_data <- test_data %>% 
  mutate(index = seq(1:332))

performance_regressor <- full_join(test_data, forecast_regressor2, by = join_by(index == index, ds == ds)) %>%
  select(ds, y, yhat, index)

performance_regressor <- performance_regressor %>% 
  mutate(yhat = case_when((yhat < 0) ~ 0,
                          .default = yhat))

# performance_regressor <- performance_regressor %>% 
#   mutate(y = exp(y),
#          yhat = exp(yhat))

# Plotting Test
ggplot(performance_regressor) +
  geom_line(aes(x = index, y = y, color = "orange")) +
  # geom_point(aes(x = index, y = y, color = "red")) +
  geom_line(aes(x = index, y = yhat, color = "blue")) +
  #geom_point(aes(x = index, y = yhat, color = "blue")) +
  scale_color_manual(name="Value", values = c("orange", "blue"), labels = c("Predicted", "Actual")) +
  labs(title = "Regressor Prophet Test Set", 
       y = "Deaths",
       x = "Dates")

Code
# Check MAE value
regressor_mae <- mae(performance_regressor$y, performance_regressor$yhat)
print(paste("The MAE for the regressor model is", regressor_mae))
[1] "The MAE for the regressor model is 47.2132331706774"
Code
# Check MASE value
regressor_mase <- mase(performance_regressor$y, performance_regressor$yhat)
print(paste("The MASE for the regressor model is", regressor_mase))
[1] "The MASE for the regressor model is 0.77295381241934"
Code
# Check MAPE value
regressor_mape <- MAPE(performance_regressor$yhat, performance_regressor$y)
print(paste("The MAPE for the regressor model is", regressor_mape))
[1] "The MAPE for the regressor model is 0.880915064442601"

9. Holiday and Event Effect

9.1 Build Model with Holiday and Event

Code
events <- tribble(
  ~holiday, ~ds, ~lower_window, ~upper_window,
  #--|--|----
  "COVID", as.Date('2020-03-14'), -15, 15,
  "superbowl", as.Date('2020-02-02'), -7, 7,
  "superbowl", as.Date('2021-02-07'), -7, 7,
  "superbowl", as.Date('2022-02-13'), -7, 7,
  "superbowl", as.Date('2023-02-12'), -7, 7,
)

events
# A tibble: 5 × 4
  holiday   ds         lower_window upper_window
  <chr>     <date>            <dbl>        <dbl>
1 COVID     2020-03-14          -15           15
2 superbowl 2020-02-02           -7            7
3 superbowl 2021-02-07           -7            7
4 superbowl 2022-02-13           -7            7
5 superbowl 2023-02-12           -7            7
Code
# Add holidays 
model_holiday <- prophet(train_data, holidays = events, n.changepoints = 3, fit = FALSE)

# Add built-in country-specific holidays
model_holiday <- add_country_holidays(model_holiday, country_name = 'US')

# Fit model on training
holiday_fit <- fit.prophet(model_holiday, train_data)

9.2 Holiday Model Forecast

Code
# Create time range for forecast
future_holiday <- make_future_dataframe(holiday_fit, freq = "week", periods = 8)

# Append the regressor values
future_holiday <- left_join(future_holiday, test_data, by = join_by(ds == ds))

# Make prediction
forecast_holiday <- predict(holiday_fit, future_holiday)

# Visualize the forecast
plot(holiday_fit, forecast_holiday) +
  labs(title = "Holiday Prophet Model")

Code
# Visualize the forecast components
prophet_plot_components(holiday_fit, forecast_holiday)

9.3 Holiday Model Performance

Code
# Merge actual and predicted values
forecast_holiday2 <- forecast_holiday %>%
  filter(ds > as.POSIXct("2023-01-01")) %>%
  select(ds, yhat) %>% 
  mutate(index = seq(1:332))

test_data <- test_data %>% 
  mutate(index = seq(1:332))

performance_holiday <- full_join(test_data, forecast_holiday2, by = join_by(ds == ds, index == index)) %>% 
  select(ds, y, yhat)

# performance_holiday <- performance_holiday %>% 
#   mutate(y = exp(y),
#          yhat = exp(yhat))

ggplot(performance_holiday) +
  geom_line(aes(x = ds, y = y, color = "red")) +
  geom_point(aes(x = ds, y = y, color = "red")) +
  geom_line(aes(x = ds, y = yhat, color = "blue")) +
  geom_point(aes(x = ds, y = yhat, color = "blue")) +
  scale_color_manual(name="Value", values = c("red", "blue"), labels = c("Predicted", "Actual")) +
  labs(title = "Holiday Prophet Test Set", 
       y = "Deaths",
       x = "Dates") 

Code
# Check MAE value
holiday_mae <- mae(performance_holiday$y, performance_holiday$yhat)
print(paste("The MAE for the holiday model is", holiday_mae))
[1] "The MAE for the holiday model is 82.8681908580654"
Code
# Check MASE value
holiday_mase <- mase(performance_holiday$y, performance_holiday$yhat)
print(paste("The MASE for the holiday model is", holiday_mase))
[1] "The MASE for the holiday model is 1.3566807386497"
Code
# Check MAPE value
# holiday_mape <- MAPE(performance_holiday$yhat, performance_holiday$y)
# print(paste("The MAPE for the holiday model is", holiday_mape))

10 Putting Best Parameters Together

Code
# logging values
train_data <- train_data %>% 
  mutate(y = log(y))

test_data <- test_data %>% 
  mutate(y = log(y))

# Adding seasonality, holidays, changepoints
best_prophet <- prophet(train_data,
                        yearly.seasonality = TRUE,
                        weekly.seasonality = TRUE,
                        holidays = events,
                        n.changepoints = 3,
                        fit = FALSE)

best_prophet <- add_country_holidays(best_prophet, country_name = 'US')

# Adding features
for (name in features){
   best_prophet <- add_regressor(best_prophet, name)
}

# Fit model
best_fit <- fit.prophet(best_prophet, train_data)

10.1 Forecasting Best Parameters

Code
# Create time range for forecast
future_best <- make_future_dataframe(best_fit, freq = "week", periods = 8)

# Append the regressor values
future_best <- left_join(future_best, train_data, by = join_by(ds == ds))

# Make prediction
forecast_best <- predict(best_fit, selected_deaths)

# unlog
forecast_best1 <- forecast_best %>% 
  mutate(yhat = exp(yhat))

# Visualize the forecast
plot(best_fit, forecast_best) +
 labs(title = "Best Prophet Model")

Code
forecast_best2 <- forecast_best %>% 
  mutate(yhat = exp(yhat),
         ds = as.Date(ds)) %>% 
  select(ds, yhat) %>% 
  mutate(index = seq(1:7306))

selected_deaths2 <- selected_deaths %>% 
  select(ds, y)  %>% 
  mutate(index = seq(1:7306))

best_predictions <- full_join(forecast_best2, selected_deaths2, join_by(ds == ds, index == index))

save(best_predictions, file = "results/best_multi_prophet_pred.rda")

best_predictions2 <- best_predictions %>% 
  filter(ds >= as.Date("2023-01-01"))

ggplot() +
  geom_line(data = best_predictions2, aes(x = index, y = y, color = "orange"), alpha = 0.55) +
  geom_line(data = best_predictions2, aes(x = index, y = yhat, color = "blue"), alpha = 0.85) +
  scale_color_manual(name="Value", values = c("orange", "blue"), labels = c("Predicted", "Actual")) +
  scale_x_continuous(breaks = c(7000, 7150, 7300),
                     labels = c("Jan 2023", "Feb 2023", "March 2023")) +
  theme_bw() +
  theme(legend.title = element_blank()) +
  labs(x = NULL,
       y = NULL) +
  theme(aspect.ratio=0.2) 

10.2 Best Parameters Performance

Code
# Merge actual and predicted values
forecast_best2 <- forecast_best %>%
  filter(ds > as.POSIXct("2023-01-01")) %>% 
  select(ds, yhat) %>% 
  mutate(index = seq(1:332))

test_data <- test_data %>% 
  mutate(index = seq(1:332))

performance_best <- full_join(test_data, forecast_best2, by = join_by(index == index, ds == ds)) %>%
  select(ds, y, yhat, index)

performance_best <- performance_best %>% 
  mutate(y = exp(y),
         yhat = exp(yhat))
Code
best_test_plot <- ggplot(performance_best) +
  geom_line(aes(x = index, y = y, color = "orange")) +
  #geom_point(aes(x = index, y = y, color = "red")) +
  geom_line(aes(x = index, y = yhat, color = "blue")) +
  #geom_point(aes(x = index, y = yhat, color = "blue")) +
  scale_color_manual(name="Value", values = c("orange", "blue"), labels = c("Predicted", "Actual")) +
  scale_x_continuous(breaks = c(75, 185, 300),
                     labels = c("Jan 2023", "Feb 2023", "March 2023")) +
  labs(title = "Best Prophet Test Set", 
       y = "Deaths",
       x = "Dates") +
  coord_fixed(ratio = 0.2) +
  theme(legend.title = element_blank())
best_test_plot

Code
# Check MAE value
best_mae <- mae(performance_best$y, performance_best$yhat)
print(paste("The MAE for the best model is", best_mae))
[1] "The MAE for the best model is 30.8251140866864"
Code
# Check MASE value
best_mase <- mase(performance_best$y, performance_best$yhat)
print(paste("The MASE for the best model is", best_mase))
[1] "The MASE for the best model is 0.504654899727629"
Code
# Check MAPE value
best_mape <- MAPE(performance_best$yhat, performance_best$y)
print(paste("The MAPE for the best model is", best_mape))
[1] "The MAPE for the best model is 0.440957849425632"
Code
# Printing All of them together
results <- tribble(
  ~model, ~mae, ~mase,
  #--|--|----
  "baseline", baseline_mae, baseline_mase,
  "changepoint", changepoint_mae, changepoint_mase,
  "holiday", holiday_mae, holiday_mase,
  "regressors", regressor_mae, regressor_mase,
  "season", season_mae, season_mase,
  "best", best_mae, best_mase
) %>% 
  kbl() %>% 
  kable_styling()

results
model mae mase
baseline 109.39576 1.7909781
changepoint 80.69683 1.3211322
holiday 82.86819 1.3566807
regressors 47.21323 0.7729538
season 104.09278 1.7041602
best 30.82511 0.5046549
Code
# save best model
save(results, best_fit, performance_best, forecast_best, best_mase, best_mae,
     file = "results/multi_prophet.rda")